import numpy as np
import plotly.express as px
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import umap
from pyensembl import EnsemblRelease
from itertools import product
from Bio.Seq import translate
import pickle
import ibis
ibis.set_backend("duckdb")
ibis.options.interactive = True
from ibis import _
import ibis.selectors as s
import warnings
warnings.filterwarnings('ignore')
/Users/jordanramsdell/mambaforge/envs/ml_ibis/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
def construct_databases(base_loc):
    """Open every visible entry under ``base_loc`` as an ibis parquet table.

    Entries whose name starts with "." are skipped. Each remaining entry is
    read with ``ibis.read_parquet`` from ``base_loc + "/" + <entry name>``.

    Parameters
    ----------
    base_loc : str
        Directory whose entries are the parquet datasets to load.

    Returns
    -------
    dict
        Maps ``"t_" + <entry name>`` to the table returned by
        ``ibis.read_parquet`` for that entry.
    """
    return {
        "t_" + entry: ibis.read_parquet("/".join((base_loc, entry)))
        for entry in os.listdir(base_loc)
        if not entry.startswith(".")
    }
# Load the parquet databases into module-level variables named t_<directory>
# (for example the t_diseases and t_targets tables used below).
# globals() is used instead of locals(): updating the locals() mapping only
# creates variables at module scope, where it is the same dict as globals(),
# so this keeps working if the cell is ever moved inside a function.
globals().update(construct_databases("../../../data/open_targets/"))
def vectorize_and_embed(docs, n_components=3, use_densmap=False,
                        metric='euclidean', n_neighbors=15,
                        vectorizer_fun=lambda: CountVectorizer(stop_words='english')):
    """Vectorize text documents and fit a UMAP embedding on the result.

    Parameters
    ----------
    docs : iterable of str
        Documents passed to the vectorizer's ``fit_transform``.
    n_components : int
        Number of embedding dimensions.
    use_densmap : bool
        Forwarded to UMAP as ``densmap``.
    metric : str
        Distance metric forwarded to UMAP.
    n_neighbors : int
        Neighbourhood size forwarded to UMAP.
    vectorizer_fun : callable
        Zero-argument factory returning a fresh vectorizer; the default builds
        a ``CountVectorizer`` with English stop words.

    Returns
    -------
    The fitted ``umap.UMAP`` object.
    """
    counts = vectorizer_fun().fit_transform(docs)
    mapper = umap.UMAP(
        n_components=n_components,
        # Previously accepted by this function but never handed to UMAP.
        n_neighbors=n_neighbors,
        densmap=use_densmap,
        metric=metric,
        random_state=42,  # fixed seed so repeated runs give the same layout
    ).fit(counts)
    return mapper
def construct_scatterplot(df, mapper, hover_name, color=None, hover_data=None):
    """Show a 3D scatter plot of a fitted embedding.

    The three rows of ``mapper.embedding_.T`` are written into ``df`` as the
    columns ``x``, ``y`` and ``z``, so ``df`` is modified in place and the
    embedding must have exactly three components.

    Parameters
    ----------
    df : DataFrame
        One row per embedded point; receives the ``x``/``y``/``z`` columns.
    mapper : object
        Fitted model exposing ``embedding_``.
    hover_name, color, hover_data
        Forwarded unchanged to ``px.scatter_3d``.

    Returns
    -------
    Whatever ``fig.show()`` returns.
    """
    xs, ys, zs = mapper.embedding_.T
    df["x"] = xs
    df["y"] = ys
    df["z"] = zs
    figure = px.scatter_3d(
        df,
        x="x",
        y="y",
        z="z",
        color=color,
        hover_name=hover_name,
        hover_data=hover_data,
    )
    # Remove the margins around the plot and use small markers.
    figure.update_layout(margin={"l": 0, "r": 0, "t": 0, "b": 0})
    figure.update_traces(marker={"size": 2})
    return figure.show()
# Preview the diseases table: with ibis.options.interactive enabled, evaluating
# the bare table expression renders its first rows.
t_diseases
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ id ┃ code ┃ dbXRefs ┃ description ┃ name ┃ directLocationIds ┃ obsoleteTerms ┃ parents ┃ synonyms ┃ ancestors ┃ descendants ┃ children ┃ therapeuticAreas ┃ indirectLocationIds ┃ ontology ┃ ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ string │ string │ array<string> │ string │ string │ array<string> │ array<string> │ array<string> │ struct<hasBroadSynonym: array<string>, hasExactSynonym: array<string>, hasNarro… │ array<string> │ array<string> │ array<string> │ array<string> │ array<string> │ struct<isTherapeuticArea: boolean, leaf: boolean, sources: struct<url: string, … │ 
├─────────────┼──────────────────────────────────────────┼──────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────┼───────────────────┼───────────────┼──────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┼────────────────────────────────────────────┼─────────────────────────────────────────┼────────────────────────────────────────┼───────────────────────────────────────────┼─────────────────────┼──────────────────────────────────────────────────────────────────────────────────┤ │ DOID_7551 │ http://purl.obolibrary.org/obo/DOID_7551 │ ['ICD9:098.89', 'MeSH:D006069', ... +17] │ A primary bacterial infectious disease that is a sexually transmitted infection… │ gonorrhea │ NULL │ NULL │ ['EFO_0003955', 'MONDO_0000314'] │ {'hasBroadSynonym': None, 'hasExactSynonym': [...], ... +2} │ ['EFO_0000512', 'MONDO_0100336', ... +8] │ [] │ [] │ ['MONDO_0100336', 'OTAR_0000017', ... +1] │ NULL │ {'isTherapeuticArea': False, 'leaf': True, ... +1} │ │ EFO_0004254 │ http://www.ebi.ac.uk/efo/EFO_0004254 │ ['NCIt:C34645', 'NCIT:C34645', ... +9] │ A slowly progressive inflammation of the glomeruli characterized by immune comp… │ membranous glomerulonephritis │ NULL │ NULL │ ['MONDO_0002462'] │ {'hasBroadSynonym': None, 'hasExactSynonym': [...], ... +2} │ ['EFO_1002050', 'EFO_0009690', ... +3] │ ['MONDO_0013860'] │ ['MONDO_0013860'] │ ['EFO_0009690'] │ NULL │ {'isTherapeuticArea': False, 'leaf': False, ... +1} │ │ EFO_0005189 │ http://www.ebi.ac.uk/efo/EFO_0005189 │ ['SNOMEDCT:74427007'] │ The respiratory quotient (or RQ or respiratory coefficient), is a dimensionless… │ respiratory quotient │ NULL │ NULL │ ['EFO_0005115'] │ NULL │ ['EFO_0001444', 'EFO_0005115'] │ [] │ [] │ ['EFO_0001444'] │ NULL │ {'isTherapeuticArea': False, 'leaf': True, ... 
+1} │ │ EFO_0005853 │ http://www.ebi.ac.uk/efo/EFO_0005853 │ [] │ short or long term physiological response of an organism, eg in terms of deposi… │ response to silica exposure │ NULL │ NULL │ ['GO_0050896'] │ NULL │ ['GO_0050896', 'GO_0008150'] │ [] │ [] │ ['GO_0008150'] │ NULL │ {'isTherapeuticArea': False, 'leaf': True, ... +1} │ │ EFO_0006317 │ http://www.ebi.ac.uk/efo/EFO_0006317 │ [] │ Any process that results in a change in state or activity of a cell or an organ… │ response to thiopurine │ NULL │ NULL │ ['GO_0042493'] │ NULL │ ['GO_0050896', 'GO_0008150', ... +1] │ ['EFO_0007853'] │ ['EFO_0007853'] │ ['GO_0008150'] │ NULL │ {'isTherapeuticArea': False, 'leaf': False, ... +1} │ │ EFO_0007229 │ http://www.ebi.ac.uk/efo/EFO_0007229 │ ['DOID:12053', 'NCIT:C2967', ... +11] │ An opportunistic mycosis that results_in fungal infection and has_material_basi… │ cryptococcosis │ NULL │ NULL │ ['EFO_0001067', 'MONDO_0002312'] │ {'hasBroadSynonym': None, 'hasExactSynonym': [...], ... +2} │ ['MONDO_0100336', 'MONDO_0002312', ... +3] │ ['EFO_0007228'] │ ['EFO_0007228'] │ ['MONDO_0100336', 'EFO_0005741'] │ NULL │ {'isTherapeuticArea': False, 'leaf': False, ... +1} │ │ EFO_0007391 │ http://www.ebi.ac.uk/efo/EFO_0007391 │ ['DOID:3106', 'MESH:D009349', ... +5] │ Infections caused by nematode larvae which never develop into the adult stage a… │ Nematoda infectious disease │ NULL │ NULL │ ['EFO_1001342'] │ {'hasBroadSynonym': None, 'hasExactSynonym': [...], ... +2} │ ['MONDO_0100336', 'EFO_0005741', ... +2] │ ['EFO_0007154', 'EFO_0007253', ... +23] │ ['EFO_0007253', 'EFO_0007468', ... +1] │ ['MONDO_0100336', 'EFO_0005741'] │ NULL │ {'isTherapeuticArea': False, 'leaf': False, ... +1} │ │ EFO_0008080 │ http://www.ebi.ac.uk/efo/EFO_0008080 │ [] │ quantification of the volume of cerebrospinal fluid in the brain, usually throu… │ cerebrospinal fluid volume measurement │ NULL │ NULL │ ['EFO_0006930'] │ NULL │ ['EFO_0005052', 'EFO_0001444', ... 
+2] │ ['EFO_0008367'] │ ['EFO_0008367'] │ ['EFO_0001444'] │ NULL │ {'isTherapeuticArea': False, 'leaf': False, ... +1} │ │ EFO_0008167 │ http://www.ebi.ac.uk/efo/EFO_0008167 │ [] │ quantification of the amount of interleukin 1 Receptor accessory protein in a s… │ interleukin 1 Receptor accessory protein measurement │ NULL │ NULL │ ['EFO_0007937'] │ NULL │ ['EFO_0004747', 'EFO_0001444', ... +1] │ [] │ [] │ ['EFO_0001444'] │ NULL │ {'isTherapeuticArea': False, 'leaf': True, ... +1} │ │ EFO_0008181 │ http://www.ebi.ac.uk/efo/EFO_0008181 │ [] │ quantification of the amount of interleukin 23 receptor in a sample │ interleukin 23 receptor measurement │ NULL │ NULL │ ['EFO_0007937'] │ NULL │ ['EFO_0004747', 'EFO_0001444', ... +1] │ [] │ [] │ ['EFO_0001444'] │ NULL │ {'isTherapeuticArea': False, 'leaf': True, ... +1} │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └─────────────┴──────────────────────────────────────────┴──────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────┴───────────────────┴───────────────┴──────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┴────────────────────────────────────────────┴─────────────────────────────────────────┴────────────────────────────────────────┴───────────────────────────────────────────┴─────────────────────┴──────────────────────────────────────────────────────────────────────────────────┘
# For this demo, only consider protein-coding targets that have GO annotations.
# The two predicates are combined with `&`: Python's `and` does not build a
# combined expression, it just returns one of its operands, so one condition
# was silently dropped. `.notnull()` is the explicit null test.
protein_coding_targets = t_targets.filter(
    (_.biotype == 'protein_coding') & _.go.notnull()
)
# Here is what the baseline expression table normally looks like.
# Note how deeply nested it is: one row per target id, with an array of
# per-tissue structs in the `tissues` column.
t_baselineExpression
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ id ┃ tissues ┃ ┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ string │ array<struct<efo_code: string, label: string, organs: array<string>, anatomical… │ ├─────────────────┼──────────────────────────────────────────────────────────────────────────────────┤ │ ENSG00000020219 │ [{...}, {...}, ... +108] │ │ ENSG00000059588 │ [{...}, {...}, ... +108] │ │ ENSG00000070182 │ [{...}, {...}, ... +118] │ │ ENSG00000070366 │ [{...}, {...}, ... +118] │ │ ENSG00000072071 │ [{...}, {...}, ... +117] │ │ ENSG00000073536 │ [{...}, {...}, ... +118] │ │ ENSG00000075290 │ [{...}, {...}, ... +108] │ │ ENSG00000083454 │ [{...}, {...}, ... +118] │ │ ENSG00000083782 │ [{...}, {...}, ... +117] │ │ ENSG00000086200 │ [{...}, {...}, ... +118] │ │ … │ … │ └─────────────────┴──────────────────────────────────────────────────────────────────────────────────┘
# The same table unpacked: unnest the `tissues` array into one row per entry,
# then spread each tissue struct into its own columns.
t_baselineExpression.select("id", _.tissues.unnest()).unpack("tissues")
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ id ┃ efo_code ┃ label ┃ organs ┃ anatomical_systems ┃ rna ┃ protein ┃ ┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ string │ string │ string │ array<string> │ array<string> │ struct<value: float64, zscore: int32, level: int32, unit: string> │ struct<reliability: boolean, level: int32, cell_type: array<struct<name: string… │ ├─────────────────┼────────────────┼──────────────────────────────────────────────┼──────────────────────────────────────────────────┼───────────────────────────────────────────┼───────────────────────────────────────────────────────────────────┼──────────────────────────────────────────────────────────────────────────────────┤ │ ENSG00000020219 │ UBERON_0012249 │ ectocervix │ ['reproductive organ', 'reproductive structure'] │ ['reproductive system'] │ {'value': 2.0, 'zscore': -1, ... +2} │ {'reliability': False, 'level': -1, ... +1} │ │ ENSG00000020219 │ CL_0000235 │ macrophage │ ['immune organ', 'blood'] │ ['immune system', 'hematopoietic system'] │ {'value': 0.0, 'zscore': -1, ... +2} │ {'reliability': False, 'level': -1, ... +1} │ │ ENSG00000020219 │ CL_0000787 │ memory B cell │ ['immune organ', 'blood'] │ ['immune system', 'hematopoietic system'] │ {'value': 0.0, 'zscore': -1, ... +2} │ {'reliability': False, 'level': -1, ... 
+1} │ │ ENSG00000020219 │ CL_0000815 │ regulatory T cell │ ['immune organ', 'blood'] │ ['immune system', 'hematopoietic system'] │ {'value': 0.0, 'zscore': -1, ... +2} │ {'reliability': False, 'level': -1, ... +1} │ │ ENSG00000020219 │ UBERON_0000948 │ heart │ ['heart'] │ ['circulatory system'] │ {'value': 0.0, 'zscore': -1, ... +2} │ {'reliability': False, 'level': -1, ... +1} │ │ ENSG00000020219 │ UBERON_0001154 │ vermiform appendix │ ['intestine', 'colon'] │ ['digestive system'] │ {'value': 0.0, 'zscore': -1, ... +2} │ {'reliability': False, 'level': -1, ... +1} │ │ ENSG00000020219 │ UBERON_0001876 │ amygdala │ ['brain'] │ ['nervous system'] │ {'value': 0.0, 'zscore': -1, ... +2} │ {'reliability': False, 'level': -1, ... +1} │ │ ENSG00000020219 │ UBERON_0002190 │ subcutaneous adipose tissue │ ['connective tissue'] │ ['integumental system'] │ {'value': 0.0, 'zscore': -1, ... +2} │ {'reliability': False, 'level': -1, ... +1} │ │ ENSG00000020219 │ CL_0002618 │ endothelial cell of umbilical vein (resting) │ ['blood'] │ ['circulatory system'] │ {'value': 0.0, 'zscore': -1, ... +2} │ {'reliability': False, 'level': -1, ... +1} │ │ ENSG00000020219 │ UBERON_0001873 │ caudate nucleus │ ['brain'] │ ['nervous system'] │ {'value': 1.0, 'zscore': -1, ... +2} │ {'reliability': False, 'level': -1, ... +1} │ │ … │ … │ … │ … │ … │ … │ … │ └─────────────────┴────────────────┴──────────────────────────────────────────────┴──────────────────────────────────────────────────┴───────────────────────────────────────────┴───────────────────────────────────────────────────────────────────┴──────────────────────────────────────────────────────────────────────────────────┘
# Map each target onto an expression vector: one mean RNA z-score per
# anatomical system.
query_baseline = (
    t_baselineExpression
    # One row per (target, tissue), with the tissue struct spread into columns
    .select("id", _.tissues.unnest())
    .unpack("tissues")
    # Keep the target id, the first listed anatomical system and the RNA z-score
    .select(id=_.id, system=_.anatomical_systems[0], zscore=_.rna.zscore)
    # Ignore expression values where system is null
    .filter(_.system.notnull())
    # Average the z-scores within each (target, system) pair
    .group_by(["id", "system"])
    .agg(mean_zscore=_.zscore.mean())
    # Pivot to one column per system, replacing spaces in the names first
    .mutate(system=_.system.replace(" ", "_"))
    .pivot_wider(id_cols="id", names_from="system", values_from="mean_zscore")
    .drop("sensory_system")  # most are null anyway
)
query_baseline
┏━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ ┃ id ┃ musculoskeletal_system ┃ integumental_system ┃ circulatory_system ┃ renal_system ┃ connective_tissue ┃ hematopoietic_system ┃ hemolymphoid_system ┃ digestive_system ┃ respiratory_system ┃ external_soft_tissue_zone ┃ nervous_system ┃ immune_system ┃ anatomical_junction ┃ endocrine_system ┃ anatomical_wall ┃ reproductive_system ┃ ┡━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ ├─────────────────┼────────────────────────┼─────────────────────┼────────────────────┼──────────────┼───────────────────┼──────────────────────┼─────────────────────┼──────────────────┼────────────────────┼───────────────────────────┼────────────────┼───────────────┼─────────────────────┼──────────────────┼─────────────────┼─────────────────────┤ │ ENSG00000271127 │ -1.0 │ -1.0 │ -1.000000 │ -1.000000 │ -1.0 │ -1.0 │ -1.000000 │ -1.000000 │ -1.00 │ -1.0 │ -1.000000 │ -1.0000 │ -1.0 │ -1.000000 │ -1.0 │ -1.000000 │ │ ENSG00000229442 │ -1.0 │ -0.6 │ -1.000000 │ -1.000000 │ -1.0 │ -1.0 │ -1.000000 │ -1.000000 │ -1.00 │ -1.0 │ -1.000000 │ -1.0000 │ -1.0 │ -1.000000 │ -1.0 │ -1.000000 │ │ ENSG00000095587 │ 0.5 │ -1.0 │ 0.666667 │ -1.000000 │ -1.0 │ -0.3 │ -0.142857 │ -0.437500 │ -1.00 
│ -1.0 │ 2.055556 │ -0.8125 │ 0.0 │ -0.625000 │ -1.0 │ -0.900000 │ │ ENSG00000219257 │ -1.0 │ -1.0 │ -1.000000 │ -1.000000 │ -1.0 │ -1.0 │ -1.000000 │ -1.000000 │ -1.00 │ -1.0 │ -1.000000 │ -1.0000 │ -1.0 │ -1.000000 │ -1.0 │ -1.000000 │ │ ENSG00000158806 │ -1.0 │ -1.0 │ -0.900000 │ -0.333333 │ -1.0 │ -0.4 │ 0.000000 │ -0.882353 │ -1.00 │ -1.0 │ 1.800000 │ -0.8750 │ -1.0 │ 0.222222 │ -1.0 │ -0.750000 │ │ ENSG00000179941 │ 0.5 │ 0.2 │ -0.100000 │ 0.000000 │ 2.0 │ 0.1 │ -0.571429 │ 0.058824 │ -0.25 │ 0.0 │ 0.000000 │ -0.9375 │ 0.0 │ -0.375000 │ 0.0 │ 0.000000 │ │ ENSG00000204685 │ 1.5 │ 0.6 │ -0.222222 │ 0.666667 │ 0.0 │ 0.2 │ -0.428571 │ 0.312500 │ 0.00 │ 0.0 │ -0.388889 │ -1.0000 │ 0.0 │ 0.000000 │ 0.0 │ 0.500000 │ │ ENSG00000244165 │ -0.5 │ 0.2 │ -0.800000 │ -0.333333 │ 0.0 │ -0.7 │ 0.285714 │ -0.235294 │ -0.50 │ 0.0 │ 0.750000 │ -0.5625 │ 0.0 │ -0.250000 │ 0.0 │ -0.083333 │ │ ENSG00000269316 │ -1.0 │ -1.0 │ -1.000000 │ -1.000000 │ -1.0 │ -1.0 │ -1.000000 │ -1.000000 │ -1.00 │ -1.0 │ -1.000000 │ -1.0000 │ -1.0 │ -1.000000 │ -1.0 │ -1.000000 │ │ ENSG00000235049 │ -1.0 │ -1.0 │ -0.888889 │ -1.000000 │ -1.0 │ -1.0 │ -1.000000 │ -0.875000 │ -1.00 │ -1.0 │ 0.055556 │ -1.0000 │ -1.0 │ -0.250000 │ -1.0 │ -0.900000 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ └─────────────────┴────────────────────────┴─────────────────────┴────────────────────┴──────────────┴───────────────────┴──────────────────────┴─────────────────────┴──────────────────┴────────────────────┴───────────────────────────┴────────────────┴───────────────┴─────────────────────┴──────────────────┴─────────────────┴─────────────────────┘
# Finally, average the expression vectors of the targets associated with each
# disease.
disease_expression_query = (
    t_associationByOverallDirect
    .select("diseaseId", "targetId")
    # Attach each associated target's expression vector
    # (a disease has one or more associated genes)
    .left_join(query_baseline, _.targetId == query_baseline.id)
    .drop("targetId", "id")
    # Average every numeric column within each disease
    .group_by("diseaseId")
    .agg(s.across(s.numeric(), _.mean()))
)
disease_expression_query
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ ┃ diseaseId ┃ musculoskeletal_system ┃ integumental_system ┃ circulatory_system ┃ renal_system ┃ connective_tissue ┃ hematopoietic_system ┃ hemolymphoid_system ┃ digestive_system ┃ respiratory_system ┃ external_soft_tissue_zone ┃ nervous_system ┃ immune_system ┃ anatomical_junction ┃ endocrine_system ┃ anatomical_wall ┃ reproductive_system ┃ ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ ├─────────────┼────────────────────────┼─────────────────────┼────────────────────┼──────────────┼───────────────────┼──────────────────────┼─────────────────────┼──────────────────┼────────────────────┼───────────────────────────┼────────────────┼───────────────┼─────────────────────┼──────────────────┼─────────────────┼─────────────────────┤ │ EFO_0000588 │ -0.084173 │ -0.093347 │ -0.245621 │ -0.161626 │ 0.181818 │ -0.250808 │ -0.134073 │ -0.094947 │ -0.323757 │ -0.201613 │ -0.406610 │ -0.407513 │ -0.289899 │ -0.429463 │ 0.020202 │ -0.299737 │ │ EFO_0004253 │ -0.216625 │ -0.217128 │ -0.331598 │ 0.269521 │ -0.261965 │ -0.412594 │ -0.378194 │ -0.148235 │ -0.499790 │ -0.307305 │ -0.409484 │ -0.565019 │ -0.367758 │ -0.381647 │ -0.176322 │ -0.406633 │ │ EFO_0004254 │ -0.143302 │ 
-0.205607 │ -0.226618 │ -0.051921 │ -0.084112 │ -0.368224 │ -0.210058 │ -0.121793 │ -0.356957 │ -0.255452 │ -0.433729 │ -0.454245 │ -0.352025 │ -0.410696 │ 0.012461 │ -0.388474 │ │ EFO_0005239 │ -0.250000 │ -0.137143 │ -0.261429 │ -0.204762 │ 0.128571 │ -0.447143 │ -0.285714 │ -0.179149 │ -0.327381 │ -0.028571 │ -0.641100 │ -0.443750 │ -0.457143 │ -0.391468 │ 0.257143 │ -0.367619 │ │ EFO_0005243 │ 0.045455 │ -0.119192 │ -0.088552 │ -0.333333 │ -0.030303 │ -0.051515 │ -0.112554 │ -0.140931 │ -0.239899 │ -0.424242 │ -0.301515 │ -0.469697 │ -0.242424 │ -0.442340 │ -0.212121 │ -0.337879 │ │ EFO_0005252 │ -0.165789 │ -0.248421 │ -0.296725 │ -0.163158 │ -0.331579 │ -0.232632 │ -0.245865 │ -0.297252 │ -0.498684 │ -0.357895 │ -0.106053 │ -0.526316 │ -0.305263 │ -0.419371 │ -0.247368 │ -0.432719 │ │ EFO_0005272 │ -0.187500 │ -0.325000 │ -0.600000 │ 0.083333 │ -0.750000 │ -0.800000 │ -0.750000 │ -0.465074 │ -0.875000 │ -0.375000 │ -0.597222 │ -0.859375 │ -0.625000 │ -0.517361 │ -0.250000 │ -0.568750 │ │ EFO_0005407 │ -0.159231 │ -0.316872 │ -0.360769 │ -0.311282 │ -0.211094 │ -0.412327 │ -0.332308 │ -0.307460 │ -0.512821 │ -0.410769 │ 0.010661 │ -0.555566 │ -0.359014 │ -0.449744 │ -0.235747 │ -0.454124 │ │ EFO_0007885 │ -0.062500 │ -0.250000 │ -0.297222 │ -0.041667 │ 1.125000 │ -0.012500 │ 0.089286 │ -0.147059 │ -0.468750 │ -0.625000 │ -0.525000 │ 0.148438 │ -0.500000 │ -0.194444 │ -0.375000 │ -0.575000 │ │ EFO_0007893 │ 0.200000 │ -0.320000 │ -0.224444 │ 0.066667 │ 0.200000 │ -0.440000 │ -0.400000 │ 0.163971 │ 0.350000 │ 0.400000 │ 0.107778 │ -0.512500 │ -0.400000 │ -0.619444 │ -0.200000 │ -0.343333 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ 
└─────────────┴────────────────────────┴─────────────────────┴────────────────────┴──────────────┴───────────────────┴──────────────────────┴─────────────────────┴──────────────────┴────────────────────┴───────────────────────────┴────────────────┴───────────────┴─────────────────────┴──────────────────┴─────────────────┴─────────────────────┘
# Label each disease with the system whose averaged expression is highest.
disease_label_by_highest_expressed_system = (
    disease_expression_query
    # Long format: one row per (disease, system) with its expression value
    .pivot_longer(~s.c("diseaseId"), values_to="expression", names_to="system")
    .group_by("diseaseId")
    # Pick the system name at the maximum expression within each disease
    .agg(most_expressed_in_system=_.system.argmax(_.expression))
)
disease_label_by_highest_expressed_system
┏━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ ┃ diseaseId ┃ most_expressed_in_system ┃ ┡━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ │ string │ string │ ├─────────────┼───────────────────────────┤ │ EFO_0000673 │ connective_tissue │ │ EFO_0004346 │ connective_tissue │ │ EFO_0007859 │ integumental_system │ │ EFO_1000049 │ anatomical_wall │ │ EFO_1000054 │ hemolymphoid_system │ │ EFO_1000058 │ hemolymphoid_system │ │ EFO_1000066 │ anatomical_wall │ │ EFO_1001478 │ anatomical_wall │ │ EFO_1001491 │ connective_tissue │ │ EFO_1001498 │ external_soft_tissue_zone │ │ … │ … │ └─────────────┴───────────────────────────┘
# Combine name, id and description with the per-disease label and the
# per-disease expression vector.
query_final = (
    t_diseases
    # Don't bother with a disease that has no description
    .dropna("description")
    .select("name", diseaseId="id", desc="description")
    # Inner joins keep only diseases present in both derived tables
    .inner_join(disease_label_by_highest_expressed_system, "diseaseId")
    .inner_join(disease_expression_query, "diseaseId")
)
query_final
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┓ ┃ name ┃ diseaseId ┃ desc ┃ most_expressed_in_system ┃ musculoskeletal_system ┃ integumental_system ┃ circulatory_system ┃ renal_system ┃ connective_tissue ┃ hematopoietic_system ┃ hemolymphoid_system ┃ digestive_system ┃ respiratory_system ┃ external_soft_tissue_zone ┃ nervous_system ┃ immune_system ┃ anatomical_junction ┃ endocrine_system ┃ anatomical_wall ┃ reproductive_system ┃ ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━┩ │ string │ string │ string │ string │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ float64 │ 
├──────────────────────────────────────────────────────┼─────────────┼──────────────────────────────────────────────────────────────────────────────────┼──────────────────────────┼────────────────────────┼─────────────────────┼────────────────────┼──────────────┼───────────────────┼──────────────────────┼─────────────────────┼──────────────────┼────────────────────┼───────────────────────────┼────────────────┼───────────────┼─────────────────────┼──────────────────┼─────────────────┼─────────────────────┤ │ gonorrhea │ DOID_7551 │ A primary bacterial infectious disease that is a sexually transmitted infection… │ connective_tissue │ -0.096844 │ -0.124075 │ -0.269214 │ -0.183644 │ 0.082471 │ -0.241006 │ -0.171879 │ -0.138369 │ -0.388618 │ -0.267145 │ -0.324140 │ -0.419989 │ -0.315517 │ -0.397776 │ -0.165230 │ -0.310319 │ │ respiratory quotient │ EFO_0005189 │ The respiratory quotient (or RQ or respiratory coefficient), is a dimensionless… │ musculoskeletal_system │ 2.000000 │ -0.400000 │ -0.400000 │ -0.666667 │ 0.000000 │ 0.300000 │ 0.000000 │ -0.352941 │ -0.500000 │ 0.000000 │ -0.800000 │ 0.437500 │ -1.000000 │ -0.333333 │ 0.000000 │ 0.083333 │ │ response to silica exposure │ EFO_0005853 │ short or long term physiological response of an organism, eg in terms of deposi… │ connective_tissue │ -0.500000 │ 0.266667 │ -0.733333 │ -0.222222 │ 0.666667 │ 0.266667 │ 0.571429 │ 0.058824 │ -0.138889 │ -0.333333 │ -0.750000 │ 0.479167 │ -0.666667 │ -0.185185 │ 0.333333 │ -0.416667 │ │ response to thiopurine │ EFO_0006317 │ Any process that results in a change in state or activity of a cell or an organ… │ hemolymphoid_system │ 0.250000 │ 0.250000 │ -0.426389 │ 0.083333 │ -0.125000 │ -0.112500 │ 0.482143 │ 0.209099 │ 0.125000 │ -0.375000 │ -0.402778 │ 0.078125 │ -0.875000 │ -0.588542 │ -0.375000 │ -0.741667 │ │ cryptococcosis │ EFO_0007229 │ An opportunistic mycosis that results_in fungal infection and has_material_basi… │ hemolymphoid_system │ -0.274286 │ -0.309714 │ -0.405206 
│ -0.312381 │ -0.468571 │ -0.230857 │ -0.031837 │ -0.155399 │ -0.220000 │ -0.451429 │ -0.514404 │ -0.271786 │ -0.537143 │ -0.509762 │ -0.091429 │ -0.504450 │ │ Nematoda infectious disease │ EFO_0007391 │ Infections caused by nematode larvae which never develop into the adult stage a… │ anatomical_wall │ -0.345930 │ -0.247674 │ -0.374354 │ -0.184109 │ -0.255814 │ -0.322674 │ -0.178571 │ -0.090544 │ -0.357558 │ -0.325581 │ -0.578775 │ -0.430596 │ -0.430233 │ -0.452116 │ 0.005814 │ -0.487350 │ │ interleukin 1 Receptor accessory protein measurement │ EFO_0008167 │ quantification of the amount of interleukin 1 Receptor accessory protein in a s… │ hematopoietic_system │ -0.750000 │ -0.500000 │ -0.800000 │ -0.500000 │ -0.500000 │ 0.300000 │ -0.357143 │ -0.676471 │ -0.750000 │ -1.000000 │ -0.925000 │ -0.593750 │ -1.000000 │ -0.555556 │ -1.000000 │ -0.666667 │ │ interleukin 23 receptor measurement │ EFO_0008181 │ quantification of the amount of interleukin 23 receptor in a sample │ hemolymphoid_system │ -0.500000 │ -0.650000 │ -0.944444 │ -0.666667 │ -0.500000 │ -0.325000 │ -0.035714 │ -0.133272 │ -0.562500 │ -1.000000 │ -0.805556 │ -0.312500 │ -1.000000 │ -0.315972 │ -0.750000 │ -0.354167 │ │ atypical femoral fracture │ EFO_0009960 │ Stress or insufficency fractures occurring in the femoral shaft, typically in r… │ hemolymphoid_system │ -0.333333 │ -0.133333 │ -0.633333 │ -0.444444 │ -0.666667 │ 0.133333 │ 0.238095 │ -0.607843 │ -0.750000 │ -0.333333 │ -0.816667 │ 0.125000 │ -0.666667 │ -0.666667 │ 0.000000 │ -0.750000 │ │ CD40 measurement │ EFO_0010586 │ quantification of the amount of CD40 in a sample │ digestive_system │ -0.750000 │ 0.500000 │ -0.400000 │ 0.333333 │ -0.500000 │ -0.600000 │ 0.214286 │ 0.705882 │ 0.458333 │ 0.500000 │ -0.875000 │ -0.468750 │ -0.500000 │ -0.111111 │ 0.000000 │ -0.083333 │ │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ … │ 
└──────────────────────────────────────────────────────┴─────────────┴──────────────────────────────────────────────────────────────────────────────────┴──────────────────────────┴────────────────────────┴─────────────────────┴────────────────────┴──────────────┴───────────────────┴──────────────────────┴─────────────────────┴──────────────────┴────────────────────┴───────────────────────────┴────────────────┴───────────────┴─────────────────────┴──────────────────┴─────────────────┴─────────────────────┘
# Materialise the final query as a pandas DataFrame
df = query_final.to_pandas()
# Positional row number, kept as a column for debugging (shown on hover)
df["index"] = range(len(df))
# Embed the disease descriptions: L1-normalised TF-IDF vectors compared with
# the Hellinger metric
disease_desc_mapper = vectorize_and_embed(
    df["desc"],
    metric="hellinger",
    vectorizer_fun=lambda: TfidfVectorizer(stop_words="english", norm="l1"),
)
construct_scatterplot(
    df,
    disease_desc_mapper,
    hover_name="name",
    color="most_expressed_in_system",
    hover_data=["diseaseId", "index"],
)
# Build the expression matrix from the rows already held in `df` instead of
# executing query_final a second time. The query has no order_by, so a second
# execution is not guaranteed to return rows in the same order as `df`, and the
# embedding would then be plotted against the wrong names and labels.
# Columns are chosen from the query's numeric columns, which excludes the
# helper columns ("index", "x", "y", "z") that were added to `df` afterwards.
expression_vectors = df[list(query_final.select(s.numeric()).columns)].fillna(0)
disease_expression_mapper = umap.UMAP(
    n_components=3, metric="euclidean", random_state=42
).fit(expression_vectors)
construct_scatterplot(
    df,
    disease_expression_mapper,
    hover_name="name",
    color="most_expressed_in_system",
    hover_data=["diseaseId", "index"],
)
# Will probably need to look into using an lil_matrix instead...
# Persist the fitted mappers and the DataFrame. Each file is opened in a
# `with` block so the handle is flushed and closed even if pickling fails,
# and the output directory is created first if it is missing.
os.makedirs("models", exist_ok=True)
with open("models/disease_expression_mapper.sav", 'wb') as fh:
    pickle.dump(disease_expression_mapper, fh)
with open("models/disease_desc_mapper.sav", 'wb') as fh:
    pickle.dump(disease_desc_mapper, fh)
with open("models/disease_df.sav", 'wb') as fh:
    pickle.dump(df, fh)
# pickle.dump(disease_names, open("models/disease_names.sav", 'wb'))